{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 06 Two-way (column or row) relative frequency tables"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%html\n",
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import plotly.graph_objects as go\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"\n",
"findspark.init()\n",
"from pyspark.context import SparkContext\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql.session import SparkSession\n",
"\n",
"spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/stats-two-way-tables/v/two-way-relative-frequency-tables?modal=1)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"dataset = {\n",
" \"Car\": 28 * [\"SUV\"] + 35 * [\"Sport car\"] + 97 * [\"SUV\"] + 104 * [\"Sport car\"],\n",
" \"Accident\": 28 * [\"yes\"] + 35 * [\"yes\"] + 97 * [\"no\"] + 104 * [\"no\"],\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Car | \n",
" Accident | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" SUV | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" SUV | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" SUV | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" SUV | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" SUV | \n",
" yes | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 259 | \n",
" Sport car | \n",
" no | \n",
"
\n",
" \n",
" 260 | \n",
" Sport car | \n",
" no | \n",
"
\n",
" \n",
" 261 | \n",
" Sport car | \n",
" no | \n",
"
\n",
" \n",
" 262 | \n",
" Sport car | \n",
" no | \n",
"
\n",
" \n",
" 263 | \n",
" Sport car | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
264 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Car Accident\n",
"0 SUV yes\n",
"1 SUV yes\n",
"2 SUV yes\n",
"3 SUV yes\n",
"4 SUV yes\n",
".. ... ...\n",
"259 Sport car no\n",
"260 Sport car no\n",
"261 Sport car no\n",
"262 Sport car no\n",
"263 Sport car no\n",
"\n",
"[264 rows x 2 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(dataset)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+--------+\n",
"|Car|Accident|\n",
"+---+--------+\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"|SUV| yes|\n",
"+---+--------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n",
"sdf.registerTempTable(\"sdf_table\")\n",
"sdf.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Accident Car \n",
"no SUV 97\n",
" Sport car 104\n",
"yes SUV 28\n",
" Sport car 35\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby([\"Accident\", \"Car\"]).size()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---------+-----+\n",
"|Accident| Car|count|\n",
"+--------+---------+-----+\n",
"| no| SUV| 97|\n",
"| yes|Sport car| 35|\n",
"| no|Sport car| 104|\n",
"| yes| SUV| 28|\n",
"+--------+---------+-----+\n",
"\n"
]
}
],
"source": [
"sdf.groupby(\"Accident\", \"Car\").count().show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---------+-----+\n",
"|Accident| Car|count|\n",
"+--------+---------+-----+\n",
"| no| SUV| 97|\n",
"| yes|Sport car| 35|\n",
"| no|Sport car| 104|\n",
"| yes| SUV| 28|\n",
"+--------+---------+-----+\n",
"\n"
]
}
],
"source": [
"spark.sql(\n",
" \"select Accident, Car, count(*) as count from sdf_table group by Accident, Car\"\n",
").show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Car | \n",
" SUV | \n",
" Sport car | \n",
"
\n",
" \n",
" Accident | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" no | \n",
" 97 | \n",
" 104 | \n",
"
\n",
" \n",
" yes | \n",
" 28 | \n",
" 35 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Car SUV Sport car\n",
"Accident \n",
"no 97 104\n",
"yes 28 35"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"two_way_table = pd.crosstab(df[\"Accident\"], df[\"Car\"])\n",
"two_way_table"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+---+---------+\n",
"|Accident_Car|SUV|Sport car|\n",
"+------------+---+---------+\n",
"| yes| 28| 35|\n",
"| no| 97| 104|\n",
"+------------+---+---------+\n",
"\n"
]
}
],
"source": [
"s_two_way_table = sdf.crosstab(\"Accident\", \"Car\")\n",
"s_two_way_table.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Car | \n",
" SUV | \n",
" Sport car | \n",
"
\n",
" \n",
" Accident | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" no | \n",
" 0.776 | \n",
" 0.748201 | \n",
"
\n",
" \n",
" yes | \n",
" 0.224 | \n",
" 0.251799 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Car SUV Sport car\n",
"Accident \n",
"no 0.776 0.748201\n",
"yes 0.224 0.251799"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freq_table = two_way_table.copy()\n",
"freq_table[\"SUV\"] = two_way_table[\"SUV\"] / two_way_table[\"SUV\"].sum()\n",
"freq_table[\"Sport car\"] = two_way_table[\"Sport car\"] / two_way_table[\"Sport car\"].sum()\n",
"freq_table"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+-----+------------------+\n",
"|Accident_Car| SUV| Sport car|\n",
"+------------+-----+------------------+\n",
"| yes|0.224|0.2517985611510791|\n",
"| no|0.776|0.7482014388489209|\n",
"+------------+-----+------------------+\n",
"\n"
]
}
],
"source": [
"s_freq_table = s_two_way_table\n",
"s_freq_table = s_freq_table.withColumn(\n",
" \"SUV\", F.col(\"SUV\") / s_freq_table.select(F.sum(\"SUV\")).collect()[0][0]\n",
")\n",
"s_freq_table = s_freq_table.withColumn(\n",
" \"Sport car\",\n",
" F.col(\"Sport car\") / s_freq_table.select(F.sum(\"Sport car\")).collect()[0][0],\n",
")\n",
"s_freq_table.show()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Car | \n",
" SUV | \n",
" Sport car | \n",
"
\n",
" \n",
" Accident | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" no | \n",
" 0.776 | \n",
" 0.748201 | \n",
"
\n",
" \n",
" yes | \n",
" 0.224 | \n",
" 0.251799 | \n",
"
\n",
" \n",
" Total | \n",
" 1.000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Car SUV Sport car\n",
"Accident \n",
"no 0.776 0.748201\n",
"yes 0.224 0.251799\n",
"Total 1.000 1.000000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freq_table = freq_table.append(freq_table.sum().rename(\"Total\"))\n",
"freq_table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}